library(tidyverse)
library(mice)
library(skimr)
library(corrplot)
library(car)
library(ISLR)
library(ggplot2)
library(gridExtra)
library(SamplingStrata)
library(rbin)
library(leaps)
library(dplyr)
library(ggplot2)
library(geosphere)
library(broom)
library(plyr)
library(devtools)
options(scipen=999)

Objective 1:

Question of Interest: Which variables best predict the price of an NYC Airbnb listing?

# Load the 2019 NYC Airbnb listings from the project repository.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 read.csv() returns
# text columns as character by default, but later steps rely on neighbourhood
# and room_type being factors (pairs() colouring, plot() of factor vs numeric).
nycraw <- read.csv(
  "https://raw.githubusercontent.com/JaclynCoate/6372_Project_1/master/AB_NYC_2019.csv",
  header = TRUE,
  strip.white = TRUE,
  stringsAsFactors = TRUE
)
head(nycraw)
##     id                                             name host_id
## 1 2539               Clean & quiet apt home by the park    2787
## 2 2595                            Skylit Midtown Castle    2845
## 3 3831                  Cozy Entire Floor of Brownstone    4869
## 4 5022 Entire Apt: Spacious Studio/Loft by central park    7192
## 5 5099        Large Cozy 1 BR Apartment In Midtown East    7322
## 6 5121                                  BlissArtsSpace!    7356
##     host_name neighbourhood_group      neighbourhood latitude longitude
## 1        John            Brooklyn         Kensington 40.64749 -73.97237
## 2    Jennifer           Manhattan            Midtown 40.75362 -73.98377
## 3 LisaRoxanne            Brooklyn       Clinton Hill 40.68514 -73.95976
## 4       Laura           Manhattan        East Harlem 40.79851 -73.94399
## 5       Chris           Manhattan        Murray Hill 40.74767 -73.97500
## 6       Garon            Brooklyn Bedford-Stuyvesant 40.68688 -73.95596
##         room_type price minimum_nights number_of_reviews last_review
## 1    Private room   149              1                 9    10/19/18
## 2 Entire home/apt   225              1                45     5/21/19
## 3 Entire home/apt    89              1               270      7/5/19
## 4 Entire home/apt    80             10                 9    11/19/18
## 5 Entire home/apt   200              3                74     6/22/19
## 6    Private room    60             45                49     10/5/17
##   reviews_per_month calculated_host_listings_count availability_365
## 1              0.21                              6              365
## 2              0.38                              2              355
## 3              4.64                              1              194
## 4              0.10                              1                0
## 5              0.59                              1              129
## 6              0.40                              1                0
# Inspect the structure of the raw data: column types and leading values
str(nycraw)
## 'data.frame':    34464 obs. of  16 variables:
##  $ id                            : int  2539 2595 3831 5022 5099 5121 5178 5203 5238 5295 ...
##  $ name                          : Factor w/ 34000 levels ""," Private 1 bdrm Lefferts Gr, BK apt",..: 8990 27105 11178 13776 17775 5839 17808 11099 12643 3931 ...
##  $ host_id                       : int  2787 2845 4869 7192 7322 7356 8967 7490 7549 7702 ...
##  $ host_name                     : Factor w/ 9124 levels "","​ Valéria",..: 4017 3840 4984 4754 1544 2840 7735 5519 958 4836 ...
##  $ neighbourhood_group           : Factor w/ 5 levels "Bronx","Brooklyn",..: 2 3 2 3 3 2 3 3 3 3 ...
##  $ neighbourhood                 : Factor w/ 218 levels "Allerton","Arden Heights",..: 108 127 42 62 137 14 95 201 36 201 ...
##  $ latitude                      : num  40.6 40.8 40.7 40.8 40.7 ...
##  $ longitude                     : num  -74 -74 -74 -73.9 -74 ...
##  $ room_type                     : Factor w/ 3 levels "Entire home/apt",..: 2 1 1 1 1 2 2 2 1 1 ...
##  $ price                         : int  149 225 89 80 200 60 79 79 150 135 ...
##  $ minimum_nights                : int  1 1 1 10 3 45 2 2 1 5 ...
##  $ number_of_reviews             : int  9 45 270 9 74 49 430 118 160 53 ...
##  $ last_review                   : Factor w/ 908 levels "1/1/17","1/1/18",..: 113 575 775 175 671 144 677 745 716 671 ...
##  $ reviews_per_month             : num  0.21 0.38 4.64 0.1 0.59 0.4 3.47 0.99 1.33 0.43 ...
##  $ calculated_host_listings_count: int  6 2 1 1 1 1 1 1 4 1 ...
##  $ availability_365              : int  365 355 194 0 129 0 220 0 188 6 ...

EDA to determine type of multiple linear regression to perform

Times Square is a popular destination, so we used each listing's latitude and longitude to build a new variable: the distance from the listing to Times Square.

# Creating a new variable, tsquare_distance (distance to Times Square in miles).
# geosphere::distHaversine() expects each point as (longitude, latitude) and
# returns metres. The earlier row-by-row loop passed (latitude, -longitude),
# which swapped the two axes and distorted every distance. The coordinates are
# now passed in the documented order, as a two-column matrix so all listings
# are handled in one vectorised call.
# NOTE: output rendered from the earlier calculation will change on re-run.
times_square <- c(-73.9855, 40.7580) # (longitude, latitude)
metres_per_mile <- 1609.344
nycraw$tsquare_distance <- distHaversine(
  cbind(nycraw$longitude, nycraw$latitude),
  times_square
) / metres_per_mile
head(nycraw)
##     id                                             name host_id
## 1 2539               Clean & quiet apt home by the park    2787
## 2 2595                            Skylit Midtown Castle    2845
## 3 3831                  Cozy Entire Floor of Brownstone    4869
## 4 5022 Entire Apt: Spacious Studio/Loft by central park    7192
## 5 5099        Large Cozy 1 BR Apartment In Midtown East    7322
## 6 5121                                  BlissArtsSpace!    7356
##     host_name neighbourhood_group      neighbourhood latitude longitude
## 1        John            Brooklyn         Kensington 40.64749 -73.97237
## 2    Jennifer           Manhattan            Midtown 40.75362 -73.98377
## 3 LisaRoxanne            Brooklyn       Clinton Hill 40.68514 -73.95976
## 4       Laura           Manhattan        East Harlem 40.79851 -73.94399
## 5       Chris           Manhattan        Murray Hill 40.74767 -73.97500
## 6       Garon            Brooklyn Bedford-Stuyvesant 40.68688 -73.95596
##         room_type price minimum_nights number_of_reviews last_review
## 1    Private room   149              1                 9    10/19/18
## 2 Entire home/apt   225              1                45     5/21/19
## 3 Entire home/apt    89              1               270      7/5/19
## 4 Entire home/apt    80             10                 9    11/19/18
## 5 Entire home/apt   200              3                74     6/22/19
## 6    Private room    60             45                49     10/5/17
##   reviews_per_month calculated_host_listings_count availability_365
## 1              0.21                              6              365
## 2              0.38                              2              355
## 3              4.64                              1              194
## 4              0.10                              1                0
## 5              0.59                              1              129
## 6              0.40                              1                0
##   tsquare_distance
## 1        2.2968748
## 2        0.1459679
## 3        2.2596886
## 4        2.9737754
## 5        0.7525852
## 6        2.4536331

Removing logically irrelevant variables

# Drop variables judged logically irrelevant for the model: identifiers, host
# details, last review date, raw coordinates and the borough-level
# neighbourhood_group (the finer-grained neighbourhood column is kept).
nyc2 <- nycraw %>%
  select(
    -id, -name, -host_id, -host_name, -last_review,
    -latitude, -longitude, -neighbourhood_group
  )
head(nyc2)
##        neighbourhood       room_type price minimum_nights
## 1         Kensington    Private room   149              1
## 2            Midtown Entire home/apt   225              1
## 3       Clinton Hill Entire home/apt    89              1
## 4        East Harlem Entire home/apt    80             10
## 5        Murray Hill Entire home/apt   200              3
## 6 Bedford-Stuyvesant    Private room    60             45
##   number_of_reviews reviews_per_month calculated_host_listings_count
## 1                 9              0.21                              6
## 2                45              0.38                              2
## 3               270              4.64                              1
## 4                 9              0.10                              1
## 5                74              0.59                              1
## 6                49              0.40                              1
##   availability_365 tsquare_distance
## 1              365        2.2968748
## 2              355        0.1459679
## 3              194        2.2596886
## 4                0        2.9737754
## 5              129        0.7525852
## 6                0        2.4536331

Zero Value Variable Check

  • Checking the range of the dependent variable (price) for zeros to remove, since a stay in NYC would not be free.
  • Checking independent variables that take the value ‘0’. An “availability_365” of 0 means the Airbnb is not available at all, so we are dropping those listings from our data set.
# Keep only listings with a non-zero price ...
nyc2 <- nyc2[nyc2$price != 0, ]
# ... and with at least one available day in the year
nyc2 <- nyc2[nyc2$availability_365 != 0, ]
invisible(view(nyc2))

NA Evaluation and Drop

#Checking for NAs: md.pattern() tabulates the missing-data pattern of nyc2
md.pattern(nyc2)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##       neighbourhood room_type price minimum_nights number_of_reviews
## 25487             1         1     1              1                 1
##                   0         0     0              0                 0
##       reviews_per_month calculated_host_listings_count availability_365
## 25487                 1                              1                1
##                       0                              0                0
##       tsquare_distance  
## 25487                1 0
##                      0 0
# Row count before removing incomplete rows
nrow(nyc2)
## [1] 25487
#Drop NAs that are present (na.omit removes every row containing an NA)
nyc3 <- na.omit(nyc2)
#Confirming NA drop: compare this row count with the one above
nrow(nyc3)
## [1] 25487

Zero variance variable check - all show variance so remain in model

#{r zero variable check} #Results show no zero variance variables, leave in all #skim(nyc3) #

Storing all categorical variables as factors

#Summarising every variable by type; the result is only printed, so nyc3
#itself is not modified by this call
skim(nyc3)
## Skim summary statistics
##  n obs: 25487 
##  n variables: 9 
## 
## ── Variable type:factor ──────────────────────────────────────────────────────────────────────────────────────────────────────────
##       variable missing complete     n n_unique
##  neighbourhood       0    25487 25487      217
##      room_type       0    25487 25487        3
##                                  top_counts ordered
##  Bed: 2172, Wil: 1739, Har: 1470, Bus: 1195   FALSE
##     Ent: 13363, Pri: 11472, Sha: 652, NA: 0   FALSE
## 
## ── Variable type:integer ─────────────────────────────────────────────────────────────────────────────────────────────────────────
##                        variable missing complete     n   mean     sd p0
##                availability_365       0    25487 25487 169.11 123.76  1
##  calculated_host_listings_count       0    25487 25487   6.98  32.08  1
##                  minimum_nights       0    25487 25487   6.23  16.8   1
##               number_of_reviews       0    25487 25487  38.94  54.8   1
##                           price       0    25487 25487 146.99 170.37 10
##  p25 p50 p75 p100     hist
##   52 157 290  365 ▇▅▂▃▂▃▃▆
##    1   1   3  327 ▇▁▁▁▁▁▁▁
##    1   2   4  999 ▇▁▁▁▁▁▁▁
##    5  17  51  629 ▇▁▁▁▁▁▁▁
##   69 107 175 8500 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────
##           variable missing complete     n mean   sd    p0  p25  p50  p75
##  reviews_per_month       0    25487 25487 1.85 1.81 0.02  0.49 1.29 2.73
##   tsquare_distance       0    25487 25487 3.44 3.07 0.016 1.48 2.69 4.26
##   p100     hist
##  58.5  ▇▁▁▁▁▁▁▁
##  18.86 ▇▇▂▁▁▁▁▁

Numerical v Numerical Multicollinearity

  • Multicollinearity will weaken the model
    • number_of_reviews and reviews_per_month are correlated at 55%
      • Removing reviews_per_month
# Copy of nyc3 used for the correlation review
corrNYC <- nyc3
# Correlation matrix of the numeric columns, opened in the viewer and stored
corrNYCTable <- corrNYC %>%
  keep(is.numeric) %>%
  cor() %>%
  view()
# Correlation plot of the same numeric columns (upper triangle only)
corrNYC %>%
  keep(is.numeric) %>%
  cor() %>%
  corrplot(
    type = "upper",
    method = "square",
    order = "hclust",
    addCoef.col = "white",
    number.digits = 2,
    number.cex = 0.5,
    tl.srt = 45,
    tl.cex = 0.8
  )

invisible(view(corrNYCTable))
# Removing reviews_per_month because it is correlated with number_of_reviews
nyc4 <- select(nyc3, -reviews_per_month)

Summary Review of Data Set

# Per-column summary of nyc4 (level counts for factors, quantiles for numerics)
summary(nyc4)
##             neighbourhood             room_type         price     
##  Bedford-Stuyvesant: 2172   Entire home/apt:13363   Min.   :  10  
##  Williamsburg      : 1739   Private room   :11472   1st Qu.:  69  
##  Harlem            : 1470   Shared room    :  652   Median : 107  
##  Bushwick          : 1195                           Mean   : 147  
##  Hell's Kitchen    : 1171                           3rd Qu.: 175  
##  Upper East Side   :  867                           Max.   :8500  
##  (Other)           :16873                                         
##  minimum_nights    number_of_reviews calculated_host_listings_count
##  Min.   :  1.000   Min.   :  1.00    Min.   :  1.000               
##  1st Qu.:  1.000   1st Qu.:  5.00    1st Qu.:  1.000               
##  Median :  2.000   Median : 17.00    Median :  1.000               
##  Mean   :  6.233   Mean   : 38.94    Mean   :  6.981               
##  3rd Qu.:  4.000   3rd Qu.: 51.00    3rd Qu.:  3.000               
##  Max.   :999.000   Max.   :629.00    Max.   :327.000               
##                                                                    
##  availability_365 tsquare_distance  
##  Min.   :  1.0    Min.   : 0.01632  
##  1st Qu.: 52.0    1st Qu.: 1.47596  
##  Median :157.0    Median : 2.69428  
##  Mean   :169.1    Mean   : 3.44215  
##  3rd Qu.:290.0    3rd Qu.: 4.25571  
##  Max.   :365.0    Max.   :18.85560  
## 

Changing Price variable range

# Restrict the analysis to listings priced from 25 to 400 inclusive
nyc4 <- nyc4 %>%
  filter(price >= 25, price <= 400)

Removing outliers from minimum nights stay

  • Anything over 365 is more than a year and would be improbable
  • Removing any minimum nights metric over 365
# Keep listings whose minimum stay is at most 365 nights
nyc4 <- nyc4[nyc4$minimum_nights <= 365, ]
invisible(view(nyc4))

Reviewing Linearity with Numeric Variables

  • Curved relationships with the numeric variables
    • Could require a quadratic or logarithmic transformation
# Scatterplot matrix of every column in nyc4 against every other column
#nyc4 %>% pairs() No color model
pairs(nyc4,col=nyc4$neighbourhood) #Color by neighborhood

Creating new Log price variable

  • Based on the above plots we may benefit from a transformation
    • Log transforming price to create a log-linear regression
# Build the log-linear data set: append lprice = log(price), then remove the
# untransformed price column.
log.nyc <- nyc4
log.nyc$lprice <- log(log.nyc$price)
log.nyc$price <- NULL
invisible(log.nyc)

Reviewing Linearity with Log-Linear model: Independent and Logged Dependent (Price) Variable

  • Curved relationships with the numeric variables
    • Could require a quadratic or logarithmic transformation
# Scatterplot matrix of log.nyc (lprice in place of price), coloured by neighbourhood
pairs(log.nyc,col=log.nyc$neighbourhood)

Log-log model

  • Due to lack of linearity trying to transform the independent variables to see if we can surface a linear relationship
# Log-transform each numeric predictor for the log-log comparison.
# NOTE: the column name `lavailablility` is misspelled; it is kept unchanged in
# case code outside this section refers to it.
log.indep.nyc <- log.nyc %>%
  mutate(
    lreviews = log(number_of_reviews),
    lnights = log(minimum_nights),
    llistings = log(calculated_host_listings_count),
    lavailablility = log(availability_365),
    ltsqr = log(tsquare_distance)
  )
invisible(log.indep.nyc)

# Keep only the logged versions of the predictors
log.indep.nyc <- select(log.indep.nyc, -c("minimum_nights", "number_of_reviews", "calculated_host_listings_count", "availability_365", "tsquare_distance"))
# Checking for -inf logged results
invisible(log.indep.nyc)
# Drop rows whose logged availability is not finite (log(0) is -Inf).
# is.finite() is a numeric test; the earlier comparison against the string
# "-Inf" relied on implicit coercion and would have turned NA values into
# all-NA rows instead of dropping them.
log.indep.nyc <- log.indep.nyc[is.finite(log.indep.nyc$lavailablility), ]
invisible(log.indep.nyc)

Reviewing Linearity with Logged Independent and Dependent Variables

  • Curved relationships with the numeric variables
    • Could require a quadratic or logarithmic transformation
# Scatterplot matrix of the logged predictors and lprice
pairs(log.indep.nyc,col=log.indep.nyc$neighbourhood) #Color by neighborhood

Continuous Variable Bin Manipulation

  • Since we are seeing large clouds of data but no linear trend with logged and unlogged data, we are going to move forward with binning the data to see if it will assist us in determining if there is a relationship between the continuous variables and log price
# Start from the cleaned, untransformed data
nyc.bins <- nyc4

# Bin each numeric predictor with var.bin(); the bin count is set per variable
nyc.bins[["reviewsBin"]] <- var.bin(nyc.bins[["number_of_reviews"]], bins = 50)
nyc.bins[["nightsBin"]] <- var.bin(nyc.bins[["minimum_nights"]], bins = 50)
nyc.bins[["availBin"]] <- var.bin(nyc.bins[["availability_365"]], bins = 50)
nyc.bins[["listBin"]] <- var.bin(nyc.bins[["calculated_host_listings_count"]], bins = 10)
nyc.bins[["tsquBin"]] <- var.bin(nyc.bins[["tsquare_distance"]], bins = 20)

# Remove the unbinned originals so only the binned versions remain
nyc.bins <- nyc.bins %>%
  select(
    -minimum_nights, -number_of_reviews, -calculated_host_listings_count,
    -availability_365, -tsquare_distance
  )
invisible(nyc.bins)

Reviewing Linearity with Binned Independent Variables

  • No linearity is presenting itself with a binned approach of the independent variables
# Fit a linear model of price on every other column of nyc.bins
# NOTE(review): the fitted object is not used in the visible part of this file.
nyc.bin.model <-lm(price~.,data=nyc.bins)
#nyc.bins  %>% pairs() No color model
# Scatterplot matrix of the binned predictors, neighbourhood, room type and price
pairs(nyc.bins,col=nyc.bins$neighbourhood) #Color by neighborhood

Explore potential correlation Neighborhood v Price

  • Up to this point we have not been able to surface linear relationships between our numeric independent variables and our dependent variable
  • Next we will check the relationship between price and the categorical variables: room_type & neighbourhood
  • We see a strong chance of a relationship between Price and Neighbourhood
# Price by neighbourhood. `main` sets the plot title; the earlier `title`
# argument is not one plot() recognises, so no title was drawn.
plot(nyc4$neighbourhood, nyc4$price,
     xlab = "Neighbourhood", ylab = "Price",
     main = "Price v Neighbourhood Correlation Check",
     col = c(7, 32, 52, 82, 107))

  • Narrowing down to one randomly sampled listing per neighbourhood to confirm that neighbourhood is a significant categorical variable to keep in the MLR
# Draw one randomly chosen listing per neighbourhood, then plot its price.
# NOTE(review): sample() runs without a fixed seed, so the selected rows (and
# therefore the plot) differ between runs.
nycNeighborhood <- ddply(nyc4, .(neighbourhood), function(x) x[sample(nrow(x), 1), ])

# `main` sets the plot title; the earlier `title` argument is not one plot()
# recognises, so no title was drawn.
plot(nycNeighborhood$neighbourhood, nycNeighborhood$price,
     xlab = "Neighbourhood", ylab = "Price",
     main = "Price v Neighbourhood Correlation Check")

Explore potential correlation Room Type v Price

  • We see a strong chance of correlation between Price and Room Type
# Price by room type. `main` sets the plot title; the earlier `title` argument
# is not one plot() recognises, so no title was drawn.
plot(nyc4$room_type, nyc4$price,
     xlab = "Room Type", ylab = "Price",
     main = "Price v Room Type Correlation Check",
     col = c(7, 32, 52))

Modeling

  • We are not seeing any linear correlation between the dependent variable and the independent numeric variables
    • We have tried: linear regression, log-linear regression transformation, log-log regression transformation, binning, and outlier drop.
  • We are seeing a strong relationship between the dependent variable and the independent categorical variables
  • The residual assumptions were best met by a log-linear model
    • Due to this we are moving forward with modeling a log-linear model with singular variables as well as all interaction terms
    • This is to add complexity to our model, since we have a low number of variables to select from
      • In adding this complexity we are trying to surface any possible linear variable interactions that may contribute to our model
      • If these are surfaced we will go back and use graphical means to verify the model’s discovery
# Log-linear model: log price regressed on neighbourhood, room type and the
# untransformed numeric predictors in log.nyc.
nyc.model <- lm(
  lprice ~ neighbourhood + room_type + minimum_nights + number_of_reviews +
    calculated_host_listings_count + availability_365 + tsquare_distance,
  data = log.nyc
)
summary(nyc.model)
## 
## Call:
## lm(formula = lprice ~ neighbourhood + room_type + minimum_nights + 
##     number_of_reviews + calculated_host_listings_count + availability_365 + 
##     tsquare_distance, data = log.nyc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.56065 -0.24179 -0.02474  0.21120  2.16671 
## 
## Coefficients:
##                                           Estimate  Std. Error  t value
## (Intercept)                             5.29397507  0.07801513   67.858
## neighbourhoodArden Heights              0.09987198  0.22240715    0.449
## neighbourhoodArrochar                  -0.23255321  0.10638369   -2.186
## neighbourhoodArverne                    0.68096923  0.08310209    8.194
## neighbourhoodAstoria                   -0.11001920  0.07026454   -1.566
## neighbourhoodBath Beach                -0.42777654  0.12335347   -3.468
## neighbourhoodBattery Park City          0.13620085  0.12153289    1.121
## neighbourhoodBay Ridge                 -0.17395292  0.07984694   -2.179
## neighbourhoodBay Terrace                0.80798639  0.17769392    4.547
## neighbourhoodBaychester                 0.09133318  0.16280354    0.561
## neighbourhoodBayside                    0.50040781  0.09899653    5.055
## neighbourhoodBayswater                  0.61573566  0.13648099    4.512
## neighbourhoodBedford-Stuyvesant        -0.20501068  0.07025206   -2.918
## neighbourhoodBelle Harbor               0.80916632  0.17595985    4.599
## neighbourhoodBellerose                  0.93671670  0.15871634    5.902
## neighbourhoodBelmont                   -0.06285772  0.11485584   -0.547
## neighbourhoodBensonhurst               -0.44614043  0.08884249   -5.022
## neighbourhoodBergen Beach              -0.24524651  0.14536608   -1.687
## neighbourhoodBoerum Hill               -0.03857191  0.08500054   -0.454
## neighbourhoodBorough Park              -0.47358248  0.08167786   -5.798
## neighbourhoodBreezy Point               1.27727475  0.26658042    4.791
## neighbourhoodBriarwood                  0.27061558  0.08944621    3.025
## neighbourhoodBrighton Beach            -0.24723574  0.08646053   -2.860
## neighbourhoodBronxdale                 -0.27859298  0.12383387   -2.250
## neighbourhoodBrooklyn Heights          -0.00732640  0.08812413   -0.083
## neighbourhoodBrownsville               -0.21869688  0.08734324   -2.504
## neighbourhoodBull's Head                0.57481364  0.37184696    1.546
## neighbourhoodBushwick                  -0.14982633  0.06868412   -2.181
## neighbourhoodCambria Heights            0.84997381  0.13401339    6.342
## neighbourhoodCanarsie                  -0.16104793  0.07347689   -2.192
## neighbourhoodCarroll Gardens           -0.06361437  0.08196143   -0.776
## neighbourhoodCastle Hill               -0.30095725  0.26659924   -1.129
## neighbourhoodCastleton Corners          0.34041077  0.26663628    1.277
## neighbourhoodChelsea                    0.10701712  0.07744292    1.382
## neighbourhoodChinatown                 -0.00982561  0.07968895   -0.123
## neighbourhoodCity Island                0.46624325  0.11955657    3.900
## neighbourhoodCivic Center               0.18664760  0.12568212    1.485
## neighbourhoodClaremont Village         -0.26318292  0.11334005   -2.322
## neighbourhoodClason Point               0.06637398  0.10978809    0.605
## neighbourhoodClifton                   -0.10760398  0.12413860   -0.867
## neighbourhoodClinton Hill              -0.06202483  0.07626291   -0.813
## neighbourhoodCo-op City                 0.41572918  0.37157745    1.119
## neighbourhoodCobble Hill                0.00564350  0.09121534    0.062
## neighbourhoodCollege Point              0.02802517  0.12038176    0.233
## neighbourhoodColumbia St               -0.16941662  0.10704526   -1.583
## neighbourhoodConcord                   -0.33584926  0.09905695   -3.390
## neighbourhoodConcourse                 -0.35881350  0.09152397   -3.920
## neighbourhoodConcourse Village         -0.35022239  0.10960180   -3.195
## neighbourhoodConey Island              -0.13913722  0.13053772   -1.066
## neighbourhoodCorona                    -0.14471304  0.08180195   -1.769
## neighbourhoodCrown Heights             -0.18375798  0.07141838   -2.573
## neighbourhoodCypress Hills             -0.14906206  0.07436662   -2.004
## neighbourhoodDitmars Steinway          -0.13840722  0.07247484   -1.910
## neighbourhoodDongan Hills              -0.25209224  0.16274809   -1.549
## neighbourhoodDouglaston                 0.62370932  0.15007817    4.156
## neighbourhoodDowntown Brooklyn          0.05680846  0.09950029    0.571
## neighbourhoodDUMBO                      0.25088636  0.12627734    1.987
## neighbourhoodDyker Heights             -0.31638170  0.13552877   -2.334
## neighbourhoodEast Elmhurst             -0.07234601  0.07159869   -1.010
## neighbourhoodEast Flatbush             -0.31140189  0.07182837   -4.335
## neighbourhoodEast Harlem               -0.00418059  0.07211481   -0.058
## neighbourhoodEast Morrisania            0.06940718  0.13838055    0.502
## neighbourhoodEast New York             -0.11171926  0.07055237   -1.583
## neighbourhoodEast Village              -0.00588189  0.07732182   -0.076
## neighbourhoodEastchester                0.39413202  0.13281691    2.967
## neighbourhoodEdenwald                   0.10073617  0.13825770    0.729
## neighbourhoodEdgemere                   0.45928450  0.14752392    3.113
## neighbourhoodElmhurst                  -0.04586177  0.07154189   -0.641
## neighbourhoodEltingville                0.04928136  0.26709942    0.185
## neighbourhoodEmerson Hill              -0.12533817  0.22091492   -0.567
## neighbourhoodFar Rockaway               0.51293264  0.11336312    4.525
## neighbourhoodFieldston                 -0.16600279  0.15312166   -1.084
## neighbourhoodFinancial District         0.25524041  0.07691610    3.318
## neighbourhoodFlatbush                  -0.36971376  0.07437810   -4.971
## neighbourhoodFlatiron District          0.22763395  0.10339020    2.202
## neighbourhoodFlatlands                 -0.20682051  0.08335050   -2.481
## neighbourhoodFlushing                   0.29539692  0.06890875    4.287
## neighbourhoodFordham                   -0.16652913  0.08704222   -1.913
## neighbourhoodForest Hills               0.27143346  0.07643973    3.551
## neighbourhoodFort Greene               -0.05824557  0.07785717   -0.748
## neighbourhoodFort Hamilton             -0.23764506  0.09302927   -2.555
## neighbourhoodFresh Meadows              0.39764141  0.09916676    4.010
## neighbourhoodGlendale                  -0.23666011  0.08838692   -2.678
## neighbourhoodGowanus                    0.01483654  0.08053105    0.184
## neighbourhoodGramercy                  -0.00140382  0.08254005   -0.017
## neighbourhoodGraniteville               0.30003749  0.37186462    0.807
## neighbourhoodGrant City                -0.45547820  0.19396395   -2.348
## neighbourhoodGravesend                 -0.38418630  0.09634785   -3.987
## neighbourhoodGreat Kills                0.37189977  0.13318659    2.792
## neighbourhoodGreenpoint                -0.05165686  0.07381696   -0.700
## neighbourhoodGreenwich Village          0.15246521  0.08091642    1.884
## neighbourhoodGrymes Hill                0.27831000  0.17596886    1.582
## neighbourhoodHarlem                    -0.09270577  0.07179948   -1.291
## neighbourhoodHell's Kitchen             0.06658374  0.07754466    0.859
## neighbourhoodHighbridge                -0.26831268  0.10523732   -2.550
## neighbourhoodHollis                     0.54822516  0.14117800    3.883
## neighbourhoodHolliswood                 1.38307509  0.26823307    5.156
## neighbourhoodHoward Beach               0.28300020  0.12788561    2.213
## neighbourhoodHowland Hook               0.21152650  0.26736772    0.791
## neighbourhoodHuguenot                   0.35335612  0.22196275    1.592
## neighbourhoodHunts Point               -0.35405299  0.11496698   -3.080
## neighbourhoodInwood                    -0.17930922  0.07538338   -2.379
## neighbourhoodJackson Heights           -0.05959317  0.07238098   -0.823
## neighbourhoodJamaica                    0.43299155  0.07427419    5.830
## neighbourhoodJamaica Estates            0.47652848  0.11919496    3.998
## neighbourhoodJamaica Hills              0.76382679  0.17694090    4.317
## neighbourhoodKensington                -0.43337179  0.08298369   -5.222
## neighbourhoodKew Gardens                0.20439973  0.10467297    1.953
## neighbourhoodKew Gardens Hills          0.30063359  0.11267107    2.668
## neighbourhoodKingsbridge               -0.16574917  0.08600170   -1.927
## neighbourhoodKips Bay                  -0.06294762  0.08138423   -0.773
## neighbourhoodLaurelton                  0.56760503  0.12303683    4.613
## neighbourhoodLighthouse Hill            0.46386216  0.26674128    1.739
## neighbourhoodLittle Italy              -0.02633816  0.08869225   -0.297
## neighbourhoodLittle Neck                0.56431025  0.22518590    2.506
## neighbourhoodLong Island City          -0.06806990  0.07357323   -0.925
## neighbourhoodLongwood                  -0.25730909  0.09011566   -2.855
## neighbourhoodLower East Side           -0.01866631  0.07784732   -0.240
## neighbourhoodManhattan Beach           -0.30401950  0.16423386   -1.851
## neighbourhoodMarble Hill                0.10155905  0.17659767    0.575
## neighbourhoodMariners Harbor            0.20906207  0.15344361    1.362
## neighbourhoodMaspeth                   -0.20199637  0.07848588   -2.574
## neighbourhoodMelrose                   -0.13567952  0.19500991   -0.696
## neighbourhoodMiddle Village            -0.01050842  0.09792488   -0.107
## neighbourhoodMidland Beach             -0.03755757  0.19399870   -0.194
## neighbourhoodMidtown                    0.13637112  0.07758783    1.758
## neighbourhoodMidwood                   -0.37380059  0.08709251   -4.292
## neighbourhoodMill Basin                 0.03554244  0.19468997    0.183
## neighbourhoodMorningside Heights       -0.04083690  0.08174499   -0.500
## neighbourhoodMorris Heights            -0.48426244  0.13376484   -3.620
## neighbourhoodMorris Park                0.05729539  0.12381834    0.463
## neighbourhoodMorrisania                -0.17215617  0.16349160   -1.053
## neighbourhoodMott Haven                -0.28525263  0.08637318   -3.303
## neighbourhoodMount Eden                -0.51464895  0.26724219   -1.926
## neighbourhoodMount Hope                -0.28228600  0.11545008   -2.445
## neighbourhoodMurray Hill                0.04985858  0.08060353    0.619
## neighbourhoodNavy Yard                  0.19356895  0.17972430    1.077
## neighbourhoodNeponsit                   0.90913355  0.22086458    4.116
## neighbourhoodNew Brighton               0.19804521  0.17597314    1.125
## neighbourhoodNew Dorp Beach            -0.38802742  0.26656938   -1.456
## neighbourhoodNew Springville            0.23133183  0.17664842    1.310
## neighbourhoodNoHo                       0.19601719  0.09970690    1.966
## neighbourhoodNolita                     0.17857368  0.08396976    2.127
## neighbourhoodNorth Riverdale           -0.37755865  0.17618941   -2.143
## neighbourhoodNorwood                   -0.12395241  0.10996453   -1.127
## neighbourhoodOakwood                    0.15897217  0.19405151    0.819
## neighbourhoodOlinville                 -0.01247849  0.26667515   -0.047
## neighbourhoodOzone Park                 0.03494678  0.08588990    0.407
## neighbourhoodPark Slope                -0.02027421  0.07750808   -0.262
## neighbourhoodParkchester               -0.22584648  0.09880091   -2.286
## neighbourhoodPelham Bay                 0.22376378  0.12411923    1.803
## neighbourhoodPelham Gardens            -0.16120645  0.10012988   -1.610
## neighbourhoodPort Morris               -0.33120778  0.09388403   -3.528
## neighbourhoodPort Richmond             -0.07532229  0.17607534   -0.428
## neighbourhoodPrince's Bay               0.33919691  0.37313543    0.909
## neighbourhoodProspect Heights          -0.07839375  0.07865639   -0.997
## neighbourhoodProspect-Lefferts Gardens -0.30645995  0.07442523   -4.118
## neighbourhoodQueens Village             0.42745785  0.09236844    4.628
## neighbourhoodRandall Manor             -0.03507587  0.11453640   -0.306
## neighbourhoodRed Hook                  -0.21088252  0.09011473   -2.340
## neighbourhoodRego Park                  0.00196639  0.07772904    0.025
## neighbourhoodRichmond Hill              0.17148839  0.07839775    2.187
## neighbourhoodRichmondtown              -0.26236675  0.37145885   -0.706
## neighbourhoodRidgewood                 -0.15562456  0.07136032   -2.181
## neighbourhoodRiverdale                 -0.02734692  0.19463010   -0.141
## neighbourhoodRockaway Beach             0.47908270  0.08876348    5.397
## neighbourhoodRoosevelt Island          -0.02780919  0.09810800   -0.283
## neighbourhoodRosebank                  -0.05542293  0.16311375   -0.340
## neighbourhoodRosedale                   0.61696146  0.09353426    6.596
## neighbourhoodRossville                  0.20771147  0.37301040    0.557
## neighbourhoodSchuylerville              0.20848551  0.12807724    1.628
## neighbourhoodSea Gate                  -0.21239655  0.37223131   -0.571
## neighbourhoodSheepshead Bay            -0.29633511  0.07808217   -3.795
## neighbourhoodShore Acres               -0.59098644  0.26690913   -2.214
## neighbourhoodSilver Lake               -0.36388803  0.37144089   -0.980
## neighbourhoodSoHo                       0.21020883  0.08007299    2.625
## neighbourhoodSoundview                 -0.31080479  0.12787100   -2.431
## neighbourhoodSouth Beach                0.16900454  0.15275022    1.106
## neighbourhoodSouth Ozone Park           0.36039755  0.09063935    3.976
## neighbourhoodSouth Slope               -0.09549537  0.07925784   -1.205
## neighbourhoodSpringfield Gardens        0.64738238  0.08374254    7.731
## neighbourhoodSpuyten Duyvil             0.16900438  0.22153244    0.763
## neighbourhoodSt. Albans                 0.50093184  0.08598072    5.826
## neighbourhoodSt. George                 0.00422551  0.09123080    0.046
## neighbourhoodStapleton                 -0.07394654  0.10181800   -0.726
## neighbourhoodStuyvesant Town            0.00114719  0.13403485    0.009
## neighbourhoodSunnyside                 -0.17555840  0.07291657   -2.408
## neighbourhoodSunset Park               -0.33236749  0.07642401   -4.349
## neighbourhoodTheater District           0.15660269  0.08524855    1.837
## neighbourhoodThrogs Neck                0.30797460  0.10490571    2.936
## neighbourhoodTodt Hill                 -0.07923354  0.22084688   -0.359
## neighbourhoodTompkinsville             -0.15580093  0.08979337   -1.735
## neighbourhoodTottenville                0.83540431  0.18145541    4.604
## neighbourhoodTremont                   -0.38630543  0.15307095   -2.524
## neighbourhoodTribeca                    0.44374720  0.09064056    4.896
## neighbourhoodTwo Bridges               -0.02179565  0.09307978   -0.234
## neighbourhoodUnionport                 -0.03153795  0.19396732   -0.163
## neighbourhoodUniversity Heights        -0.32165755  0.12149019   -2.648
## neighbourhoodUpper East Side            0.03259793  0.07377120    0.442
## neighbourhoodUpper West Side            0.03505626  0.07635370    0.459
## neighbourhoodVan Nest                  -0.16375494  0.16272185   -1.006
## neighbourhoodVinegar Hill               0.06368466  0.12064051    0.528
## neighbourhoodWakefield                 -0.02045184  0.08889643   -0.230
## neighbourhoodWashington Heights        -0.22382717  0.07168248   -3.122
## neighbourhoodWest Brighton             -0.02249759  0.11201009   -0.201
## neighbourhoodWest Farms                 0.25006931  0.26665649    0.938
## neighbourhoodWest Village               0.17456643  0.07738087    2.256
## neighbourhoodWestchester Square        -0.16718585  0.15265050   -1.095
## neighbourhoodWesterleigh               -0.03621837  0.26669131   -0.136
## neighbourhoodWhitestone                 0.37701584  0.14505343    2.599
## neighbourhoodWilliamsbridge            -0.02764262  0.09379034   -0.295
## neighbourhoodWilliamsburg               0.01165297  0.07244470    0.161
## neighbourhoodWillowbrook                0.87549876  0.37150376    2.357
## neighbourhoodWindsor Terrace           -0.24014716  0.08353565   -2.875
## neighbourhoodWoodhaven                 -0.06630459  0.07940355   -0.835
## neighbourhoodWoodlawn                  -0.27003596  0.13800307   -1.957
## neighbourhoodWoodside                  -0.11392214  0.07403831   -1.539
## room_typePrivate room                  -0.68971493  0.00499512 -138.078
## room_typeShared room                   -1.13438114  0.01543741  -73.483
## minimum_nights                         -0.00480495  0.00017622  -27.267
## number_of_reviews                      -0.00044760  0.00004349  -10.292
## calculated_host_listings_count         -0.00029200  0.00009529   -3.064
## availability_365                        0.00044007  0.00001981   22.219
## tsquare_distance                       -0.07393171  0.00485827  -15.218
##                                                    Pr(>|t|)    
## (Intercept)                            < 0.0000000000000002 ***
## neighbourhoodArden Heights                         0.653399    
## neighbourhoodArrochar                              0.028826 *  
## neighbourhoodArverne                   0.000000000000000264 ***
## neighbourhoodAstoria                               0.117412    
## neighbourhoodBath Beach                            0.000525 ***
## neighbourhoodBattery Park City                     0.262430    
## neighbourhoodBay Ridge                             0.029372 *  
## neighbourhoodBay Terrace               0.000005466066201576 ***
## neighbourhoodBaychester                            0.574801    
## neighbourhoodBayside                   0.000000433963238269 ***
## neighbourhoodBayswater                 0.000006466826102711 ***
## neighbourhoodBedford-Stuyvesant                    0.003524 ** 
## neighbourhoodBelle Harbor              0.000004275124927277 ***
## neighbourhoodBellerose                 0.000000003642604284 ***
## neighbourhoodBelmont                               0.584195    
## neighbourhoodBensonhurst               0.000000515778980536 ***
## neighbourhoodBergen Beach                          0.091598 .  
## neighbourhoodBoerum Hill                           0.649988    
## neighbourhoodBorough Park              0.000000006787005322 ***
## neighbourhoodBreezy Point              0.000001666565680574 ***
## neighbourhoodBriarwood                             0.002485 ** 
## neighbourhoodBrighton Beach                        0.004246 ** 
## neighbourhoodBronxdale                             0.024475 *  
## neighbourhoodBrooklyn Heights                      0.933743    
## neighbourhoodBrownsville                           0.012291 *  
## neighbourhoodBull's Head                           0.122158    
## neighbourhoodBushwick                              0.029165 *  
## neighbourhoodCambria Heights           0.000000000230111765 ***
## neighbourhoodCanarsie                              0.028402 *  
## neighbourhoodCarroll Gardens                       0.437668    
## neighbourhoodCastle Hill                           0.258962    
## neighbourhoodCastleton Corners                     0.201725    
## neighbourhoodChelsea                               0.167020    
## neighbourhoodChinatown                             0.901871    
## neighbourhoodCity Island               0.000096542398726438 ***
## neighbourhoodCivic Center                          0.137536    
## neighbourhoodClaremont Village                     0.020238 *  
## neighbourhoodClason Point                          0.545474    
## neighbourhoodClifton                               0.386057    
## neighbourhoodClinton Hill                          0.416052    
## neighbourhoodCo-op City                            0.263227    
## neighbourhoodCobble Hill                           0.950667    
## neighbourhoodCollege Point                         0.815917    
## neighbourhoodColumbia St                           0.113511    
## neighbourhoodConcord                               0.000699 ***
## neighbourhoodConcourse                 0.000088632696018338 ***
## neighbourhoodConcourse Village                     0.001398 ** 
## neighbourhoodConey Island                          0.286490    
## neighbourhoodCorona                                0.076895 .  
## neighbourhoodCrown Heights                         0.010089 *  
## neighbourhoodCypress Hills                         0.045036 *  
## neighbourhoodDitmars Steinway                      0.056180 .  
## neighbourhoodDongan Hills                          0.121401    
## neighbourhoodDouglaston                0.000032512275993055 ***
## neighbourhoodDowntown Brooklyn                     0.568047    
## neighbourhoodDUMBO                                 0.046957 *  
## neighbourhoodDyker Heights                         0.019582 *  
## neighbourhoodEast Elmhurst                         0.312296    
## neighbourhoodEast Flatbush             0.000014610597310590 ***
## neighbourhoodEast Harlem                           0.953772    
## neighbourhoodEast Morrisania                       0.615976    
## neighbourhoodEast New York                         0.113322    
## neighbourhoodEast Village                          0.939364    
## neighbourhoodEastchester                           0.003005 ** 
## neighbourhoodEdenwald                              0.466246    
## neighbourhoodEdgemere                              0.001852 ** 
## neighbourhoodElmhurst                              0.521498    
## neighbourhoodEltingville                           0.853618    
## neighbourhoodEmerson Hill                          0.570475    
## neighbourhoodFar Rockaway              0.000006077113828277 ***
## neighbourhoodFieldston                             0.278321    
## neighbourhoodFinancial District                    0.000907 ***
## neighbourhoodFlatbush                  0.000000671532545171 ***
## neighbourhoodFlatiron District                     0.027696 *  
## neighbourhoodFlatlands                             0.013096 *  
## neighbourhoodFlushing                  0.000018197596803125 ***
## neighbourhoodFordham                               0.055734 .  
## neighbourhoodForest Hills                          0.000385 ***
## neighbourhoodFort Greene                           0.454402    
## neighbourhoodFort Hamilton                         0.010639 *  
## neighbourhoodFresh Meadows             0.000060945081874284 ***
## neighbourhoodGlendale                              0.007421 ** 
## neighbourhoodGowanus                               0.853832    
## neighbourhoodGramercy                              0.986431    
## neighbourhoodGraniteville                          0.419763    
## neighbourhoodGrant City                            0.018869 *  
## neighbourhoodGravesend                 0.000066970587129689 ***
## neighbourhoodGreat Kills                           0.005237 ** 
## neighbourhoodGreenpoint                            0.484061    
## neighbourhoodGreenwich Village                     0.059546 .  
## neighbourhoodGrymes Hill                           0.113757    
## neighbourhoodHarlem                                0.196655    
## neighbourhoodHell's Kitchen                        0.390542    
## neighbourhoodHighbridge                            0.010791 *  
## neighbourhoodHollis                                0.000103 ***
## neighbourhoodHolliswood                0.000000253926229845 ***
## neighbourhoodHoward Beach                          0.026913 *  
## neighbourhoodHowland Hook                          0.428867    
## neighbourhoodHuguenot                              0.111406    
## neighbourhoodHunts Point                           0.002075 ** 
## neighbourhoodInwood                                0.017385 *  
## neighbourhoodJackson Heights                       0.410331    
## neighbourhoodJamaica                   0.000000005624995484 ***
## neighbourhoodJamaica Estates           0.000064097939546765 ***
## neighbourhoodJamaica Hills             0.000015889934877878 ***
## neighbourhoodKensington                0.000000178097996278 ***
## neighbourhoodKew Gardens                           0.050861 .  
## neighbourhoodKew Gardens Hills                     0.007630 ** 
## neighbourhoodKingsbridge                           0.053957 .  
## neighbourhoodKips Bay                              0.439256    
## neighbourhoodLaurelton                 0.000003983579965643 ***
## neighbourhoodLighthouse Hill                       0.082048 .  
## neighbourhoodLittle Italy                          0.766499    
## neighbourhoodLittle Neck                           0.012218 *  
## neighbourhoodLong Island City                      0.354871    
## neighbourhoodLongwood                              0.004303 ** 
## neighbourhoodLower East Side                       0.810502    
## neighbourhoodManhattan Beach                       0.064162 .  
## neighbourhoodMarble Hill                           0.565238    
## neighbourhoodMariners Harbor                       0.173063    
## neighbourhoodMaspeth                               0.010069 *  
## neighbourhoodMelrose                               0.486588    
## neighbourhoodMiddle Village                        0.914543    
## neighbourhoodMidland Beach                         0.846493    
## neighbourhoodMidtown                               0.078822 .  
## neighbourhoodMidwood                   0.000017775984886379 ***
## neighbourhoodMill Basin                            0.855145    
## neighbourhoodMorningside Heights                   0.617386    
## neighbourhoodMorris Heights                        0.000295 ***
## neighbourhoodMorris Park                           0.643557    
## neighbourhoodMorrisania                            0.292353    
## neighbourhoodMott Haven                            0.000959 ***
## neighbourhoodMount Eden                            0.054144 .  
## neighbourhoodMount Hope                            0.014489 *  
## neighbourhoodMurray Hill                           0.536208    
## neighbourhoodNavy Yard                             0.281476    
## neighbourhoodNeponsit                  0.000038635847207790 ***
## neighbourhoodNew Brighton                          0.260419    
## neighbourhoodNew Dorp Beach                        0.145507    
## neighbourhoodNew Springville                       0.190357    
## neighbourhoodNoHo                                  0.049318 *  
## neighbourhoodNolita                                0.033460 *  
## neighbourhoodNorth Riverdale                       0.032130 *  
## neighbourhoodNorwood                               0.259668    
## neighbourhoodOakwood                               0.412665    
## neighbourhoodOlinville                             0.962679    
## neighbourhoodOzone Park                            0.684101    
## neighbourhoodPark Slope                            0.793651    
## neighbourhoodParkchester                           0.022270 *  
## neighbourhoodPelham Bay                            0.071430 .  
## neighbourhoodPelham Gardens                        0.107417    
## neighbourhoodPort Morris                           0.000420 ***
## neighbourhoodPort Richmond                         0.668812    
## neighbourhoodPrince's Bay                          0.363335    
## neighbourhoodProspect Heights                      0.318939    
## neighbourhoodProspect-Lefferts Gardens 0.000038395384225151 ***
## neighbourhoodQueens Village            0.000003715710278657 ***
## neighbourhoodRandall Manor                         0.759423    
## neighbourhoodRed Hook                              0.019284 *  
## neighbourhoodRego Park                             0.979817    
## neighbourhoodRichmond Hill                         0.028722 *  
## neighbourhoodRichmondtown                          0.479999    
## neighbourhoodRidgewood                             0.029206 *  
## neighbourhoodRiverdale                             0.888260    
## neighbourhoodRockaway Beach            0.000000068285845369 ***
## neighbourhoodRoosevelt Island                      0.776831    
## neighbourhoodRosebank                              0.734025    
## neighbourhoodRosedale                  0.000000000043077969 ***
## neighbourhoodRossville                             0.577634    
## neighbourhoodSchuylerville                         0.103578    
## neighbourhoodSea Gate                              0.568274    
## neighbourhoodSheepshead Bay                        0.000148 ***
## neighbourhoodShore Acres                           0.026825 *  
## neighbourhoodSilver Lake                           0.327261    
## neighbourhoodSoHo                                  0.008665 ** 
## neighbourhoodSoundview                             0.015081 *  
## neighbourhoodSouth Beach                           0.268560    
## neighbourhoodSouth Ozone Park          0.000070236305106912 ***
## neighbourhoodSouth Slope                           0.228265    
## neighbourhoodSpringfield Gardens       0.000000000000011115 ***
## neighbourhoodSpuyten Duyvil                        0.445538    
## neighbourhoodSt. Albans                0.000000005745456556 ***
## neighbourhoodSt. George                            0.963058    
## neighbourhoodStapleton                             0.467685    
## neighbourhoodStuyvesant Town                       0.993171    
## neighbourhoodSunnyside                             0.016062 *  
## neighbourhoodSunset Park               0.000013732037801157 ***
## neighbourhoodTheater District                      0.066220 .  
## neighbourhoodThrogs Neck                           0.003331 ** 
## neighbourhoodTodt Hill                             0.719769    
## neighbourhoodTompkinsville                         0.082735 .  
## neighbourhoodTottenville               0.000004167298214400 ***
## neighbourhoodTremont                               0.011619 *  
## neighbourhoodTribeca                   0.000000985940162644 ***
## neighbourhoodTwo Bridges                           0.814862    
## neighbourhoodUnionport                             0.870839    
## neighbourhoodUniversity Heights                    0.008112 ** 
## neighbourhoodUpper East Side                       0.658581    
## neighbourhoodUpper West Side                       0.646145    
## neighbourhoodVan Nest                              0.314258    
## neighbourhoodVinegar Hill                          0.597582    
## neighbourhoodWakefield                             0.818044    
## neighbourhoodWashington Heights                    0.001795 ** 
## neighbourhoodWest Brighton                         0.840815    
## neighbourhoodWest Farms                            0.348359    
## neighbourhoodWest Village                          0.024083 *  
## neighbourhoodWestchester Square                    0.273431    
## neighbourhoodWesterleigh                           0.891975    
## neighbourhoodWhitestone                            0.009351 ** 
## neighbourhoodWilliamsbridge                        0.768204    
## neighbourhoodWilliamsburg                          0.872210    
## neighbourhoodWillowbrook                           0.018449 *  
## neighbourhoodWindsor Terrace                       0.004047 ** 
## neighbourhoodWoodhaven                             0.403707    
## neighbourhoodWoodlawn                              0.050390 .  
## neighbourhoodWoodside                              0.123893    
## room_typePrivate room                  < 0.0000000000000002 ***
## room_typeShared room                   < 0.0000000000000002 ***
## minimum_nights                         < 0.0000000000000002 ***
## number_of_reviews                      < 0.0000000000000002 ***
## calculated_host_listings_count                     0.002185 ** 
## availability_365                       < 0.0000000000000002 ***
## tsquare_distance                       < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3657 on 24312 degrees of freedom
## Multiple R-squared:  0.6269, Adjusted R-squared:  0.6235 
## F-statistic: 183.2 on 223 and 24312 DF,  p-value: < 0.00000000000000022
# Surfacing only the significant terms of the fitted model. The filter keeps
# every coefficient with p < 0.05 (the intercept and any non-neighbourhood
# predictors included), not just the neighbourhood levels.
nyc.model2 <- tidy(nyc.model)
options(scipen = 999)
# which() drops rows whose p-value is NA/NaN; a bare logical index would pass
# each of them through as an all-NA row in the filtered table.
nyc.modeldf <- nyc.model2[which(nyc.model2$p.value < 0.05), ]
# Open the filtered table in the data viewer without echoing it to the console.
invisible(view(nyc.modeldf))
#would reducing the data set to just sig values help?

Model selection attempts

# To be built and provided by Reagan: this section should include about four
# intuitive models, from which we take the one with the best adjusted R-squared.
# Forward selection over the main effects and pairwise interactions listed
# below; adjusted R-squared is then reported for each model size.
# NOTE(review): tsquare_distance enters only through interaction terms (no main
# effect) — confirm this is intended.
nyc.fwd <- regsubsets(
  lprice ~ neighbourhood + room_type + neighbourhood:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    availability_365 +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count +
    availability_365:minimum_nights +
    availability_365:number_of_reviews +
    availability_365:calculated_host_listings_count +
    tsquare_distance:minimum_nights +
    tsquare_distance:number_of_reviews +
    tsquare_distance:calculated_host_listings_count +
    tsquare_distance:availability_365,
  data = log.nyc,
  method = "forward",
  nvmax = 20
)
## Reordering variables and trying again:
summary(nyc.fwd)$adjr2
##  [1] 0.3636217 0.4584526 0.4814209 0.5021979 0.5114939 0.5196637 0.5260967
##  [8] 0.5317874 0.5362072 0.5406042 0.5496958 0.5525292 0.5553844 0.5582046
## [15] 0.5607291 0.5632855 0.5654550 0.5677430 0.5701914 0.5726802 0.5750815
# Commented-out template for scoring each subset size on a hold-out set and
# plotting test ASE against the number of predictors.
# NOTE(review): it still refers to reg.fwd, reg.final, test, AvgWinnings and
# golf, none of which are defined in the visible part of this analysis —
# presumably carried over from another exercise. Swap in the NYC objects
# (e.g. nyc.fwd, lprice, a train/test split) before enabling it.
#predict.regsubsets =function (object , newdata ,id ,...){
#  form=as.formula (object$call [[2]])
#  mat=model.matrix(form ,newdata )
#  coefi=coef(object ,id=id)
#  xvars=names(coefi)
#  mat[,xvars]%*%coefi
#}

#testASE<-c()
#note: the index runs to 20 since that is the nvmax set in regsubsets
#for (i in 1:20){
#  predictions<-predict.regsubsets(object=reg.fwd,newdata=test,id=i) 
#  testASE[i]<-mean((log(test$AvgWinnings)-predictions)^2)
#}

#par(mfrow=c(1,1))
#plot(1:20,testASE,type="l",xlab="# of predictors",ylab="test vs train ASE",ylim=c(0.2,0.8))

#index<-which(testASE==min(testASE))
#points(index,testASE[index],col="red",pch=10)
#rss<-summary(reg.fwd)$rss
#lines(1:20,rss/100,lty=3,col="blue")  #Dividing by 100 since ASE=RSS/sample size

#coef(reg.final,XXXX)

#final.model<-lm(log(AvgWinnings)~Greens+AvgPutts+Save,data=golf)
#summary(final.model)
# Backward elimination over the same candidate terms as the forward search;
# adjusted R-squared is reported for each model size.
nyc.bck <- regsubsets(
  lprice ~ neighbourhood + room_type + neighbourhood:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    availability_365 +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count +
    availability_365:minimum_nights +
    availability_365:number_of_reviews +
    availability_365:calculated_host_listings_count +
    tsquare_distance:minimum_nights +
    tsquare_distance:number_of_reviews +
    tsquare_distance:calculated_host_listings_count +
    tsquare_distance:availability_365,
  data = log.nyc,
  method = "backward",
  nvmax = 20
)
## Reordering variables and trying again:
summary(nyc.bck)$adjr2
##  [1] 0.3636217 0.4584526 0.4814209 0.5021979 0.5114939 0.5183706 0.5241404
##  [8] 0.5286630 0.5333358 0.5378035 0.5423879 0.5472786 0.5521801 0.5566577
## [15] 0.5604016 0.5637242 0.5665884 0.5693934 0.5721132 0.5746263 0.5770165
#summary(nyc.bck)$rss
#summary(nyc.bck)$bic
# Sequential-replacement ("seqrep") search over the same candidate terms;
# adjusted R-squared is reported for each model size.
nyc.seq <- regsubsets(
  lprice ~ neighbourhood + room_type + neighbourhood:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    availability_365 +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count +
    availability_365:minimum_nights +
    availability_365:number_of_reviews +
    availability_365:calculated_host_listings_count +
    tsquare_distance:minimum_nights +
    tsquare_distance:number_of_reviews +
    tsquare_distance:calculated_host_listings_count +
    tsquare_distance:availability_365,
  data = log.nyc,
  method = "seqrep",
  nvmax = 20
)
## Reordering variables and trying again:
summary(nyc.seq)$adjr2
##  [1] 0.3636217 0.4584526 0.4814209 0.5021979 0.5114939 0.5196637 0.5260967
##  [8] 0.5317874 0.5362072 0.5488486 0.5517418 0.5546502 0.5574035 0.5609530
## [15] 0.5642190 0.5673411 0.5697029 0.5721765 0.5746351 0.5769443 0.5792610
#summary(nyc.seq)$rss
#summary(nyc.seq)$bic
#The exhaustive search is too large to run on this model, so we are commenting it out and moving forward with the selection methods that will run
#nyc.exh = regsubsets(lprice~neighbourhood + room_type + neighbourhood:room_type + minimum_nights + number_of_reviews + calculated_host_listings_count + availability_365 + minimum_nights:number_of_reviews + minimum_nights:calculated_host_listings_count + number_of_reviews:calculated_host_listings_count + availability_365:minimum_nights + availability_365:number_of_reviews + availability_365:calculated_host_listings_count + tsquare_distance:minimum_nights + tsquare_distance:number_of_reviews + tsquare_distance:calculated_host_listings_count + tsquare_distance:availability_365, data=log.nyc, nvmax=20)
#summary(nyc.exh)$adjr2
#summary(nyc.exh)$rss
#summary(nyc.exh)$bic

Assumptions Check on Intuitive Model

  • Residuals near normally distributed but still skewed
  • Invoking the Central Limit Theorem due to such a large sample size
  • Constant Variance
    • The QQ Plot shows an extreme deviation from normality. Even with the Central Limit Theorem, we do not feel comfortable moving forward.
# Fit the untransformed-price model on every other column of nyc4, then draw
# the default lm diagnostic plots in a 2 x 2 grid.
full.model <- lm(price ~ ., data = nyc4)
par(mfrow = c(2, 2))
plot(full.model)

Assumptions Check on Log-Linear Intuitive Model

  • Residuals are nearly normally distributed, with only slight departures
    • Invoking the Central Limit Theorem due to such a large sample size
    • Passed
  • Constant variance
    • The QQ-plot is showing much less departure from normality
    • Passed
# Fit the log-price model on every other column of log.nyc, then draw the
# default lm diagnostic plots in a 2 x 2 grid.
log.depend.model <- lm(lprice ~ ., data = log.nyc)
par(mfrow = c(2, 2))
plot(log.depend.model)

  • Independence
    • Assumed
    • Passed
  • Multicollinearity
    • We are seeing a high VIF for our distance to Times Square
# vif() returns a matrix for this model; its third column is squared so the
# values can be read on the usual VIF scale.
# NOTE(review): the third column is presumably car's GVIF^(1/(2*Df)) — confirm.
gvif_table <- vif(log.depend.model)
gvif_table[, 3]^2
##                  neighbourhood                      room_type 
##                       1.020103                       1.080610 
##                 minimum_nights              number_of_reviews 
##                       1.083880                       1.061315 
## calculated_host_listings_count               availability_365 
##                       1.547333                       1.101061 
##               tsquare_distance 
##                      41.098477

MLR May Not Be The Best

  • Multiple linear regression is just one option in building a predictive model for a continuous response
  • We are seeing it as a bad option because
    • The true relationship between the response and predictors is NOT “linear”. The relationships are complex.
      • We have gotten close, but we have worked extremely hard in specifying our model and manipulating the raw data to surface a linear relationship
      • This makes the model difficult to interpret in real-world applications
    • Since the above is true and our data is very large, we think that other methods such as Random Forest or K-NN would perform better.
      • These options are less time consuming because the model complexity is built into the algorithm
      • We also do not have to specify how a relationship exists ahead of time
  • Since we see a strong relationship between the categorical variables, we move forward with a Two-Way ANOVA model to create a model to predict the price of a NYC AirBnB.